import pandas as pd
import pandas_profiling
import numpy as np
import seaborn as sns
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 418 entries, 0 to 417 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 418 non-null int64 1 Pclass 418 non-null int64 2 Name 418 non-null object 3 Sex 418 non-null object 4 Age 332 non-null float64 5 SibSp 418 non-null int64 6 Parch 418 non-null int64 7 Ticket 418 non-null object 8 Fare 417 non-null float64 9 Cabin 91 non-null object 10 Embarked 418 non-null object dtypes: float64(2), int64(4), object(5) memory usage: 36.0+ KB
train.describe()
| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
test.describe()
| PassengerId | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|
| count | 418.000000 | 418.000000 | 332.000000 | 418.000000 | 418.000000 | 417.000000 |
| mean | 1100.500000 | 2.265550 | 30.272590 | 0.447368 | 0.392344 | 35.627188 |
| std | 120.810458 | 0.841838 | 14.181209 | 0.896760 | 0.981429 | 55.907576 |
| min | 892.000000 | 1.000000 | 0.170000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 996.250000 | 1.000000 | 21.000000 | 0.000000 | 0.000000 | 7.895800 |
| 50% | 1100.500000 | 3.000000 | 27.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 1204.750000 | 3.000000 | 39.000000 | 1.000000 | 0.000000 | 31.500000 |
| max | 1309.000000 | 3.000000 | 76.000000 | 8.000000 | 9.000000 | 512.329200 |
Train dataset has missing values in Age, Cabin and Embarked.
Test dataset has missing values in Age, Cabin and Fare.
pandas_profiling.ProfileReport(train)
def sexId_encode(dataset):
    """Add an integer SexId column: 1 for 'male', 0 for anything else.

    Mutates *dataset* in place and returns it (so train/test can be
    reassigned in one line).
    """
    # Vectorized comparison replaces the per-element lambda transform:
    # faster and idiomatic pandas. Any non-'male' value (including
    # 'female' or a missing value) compares False -> 0, matching the
    # original lambda's else-branch.
    dataset['SexId'] = (dataset['Sex'] == 'male').astype(int)
    return dataset
train = sexId_encode(train)
test = sexId_encode(test)
train.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | SexId | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 1 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 0 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 1 |
The Age and SibSp columns have high negative correlation. We will use the SibSp column to help us fill the missing Age values.
sns.lineplot(data=train.loc[:,['SibSp', 'Age', 'SexId']], x='SibSp', y="Age", hue='SexId')
<AxesSubplot:xlabel='SibSp', ylabel='Age'>
We will find the median age of each SibSp group and create a map.
This age map will be used to fill the missing Age values in both train and test datasets. Note that this map is entirely built on the train dataset.
age_map = train[['SibSp', 'Age']].groupby(['SibSp']).median()
age_map
| Age | |
|---|---|
| SibSp | |
| 0 | 29.0 |
| 1 | 30.0 |
| 2 | 23.0 |
| 3 | 9.5 |
| 4 | 6.5 |
| 5 | 11.0 |
| 8 | NaN |
We found that this map has no median value for SibSp &gt; 5 (the median for SibSp = 8 is NaN), so we reuse the SibSp = 5 median for those groups.
age_map.loc[8] = age_map.loc[5]
age_map = age_map.drop([8], axis=0)
age_map
| Age | |
|---|---|
| SibSp | |
| 0 | 29.0 |
| 1 | 30.0 |
| 2 | 23.0 |
| 3 | 9.5 |
| 4 | 6.5 |
| 5 | 11.0 |
def age_fillna(dataset):
    """Fill missing Age values with the median age of the row's SibSp
    group, using the train-derived module-level `age_map`.

    SibSp values larger than the map's largest key fall back to the
    median of that largest key. Mutates *dataset* in place and returns it.
    """
    max_key = age_map.index.max()
    # Clip SibSp into the map's key range, then look up each row's median
    # age. This vectorized form replaces the original per-value loop and
    # avoids calling float() on a one-row Series, which newer pandas
    # versions reject.
    fill_values = dataset['SibSp'].clip(upper=max_key).map(age_map['Age'])
    dataset['Age'] = dataset['Age'].fillna(fill_values)
    return dataset
train = age_fillna(train)
test = age_fillna(test)
print('Any Age value still missing? ', sum(train.Age.isnull()) + sum(test.Age.isnull()) )
Any Age value still missing? 0
Before we encode this column, note that there are a couple of missing values (nan) in the column. We can ignore them and go straight with the encoding step and then we may have a little problem with the "nan" column name in that the column name has the type of float (nan) instead of string. You can call the column with numpy's nan -> df[np.nan]
If you don't like that you can use Pandas' fillna() to fill them first, like what we are doing below.
def embarked_fillna(dataset):
    """Replace missing Embarked entries with the placeholder 'UNK'.

    Mutates *dataset* in place and returns it.
    """
    embarked = dataset['Embarked']
    # Keep present values; substitute 'UNK' wherever the entry is NaN.
    dataset['Embarked'] = embarked.where(embarked.notna(), 'UNK')
    return dataset
train = embarked_fillna(train)
test = embarked_fillna(test)
sum(train['Embarked'].isna())
0
You can also use Pandas' get_dummies() for this task, but you will miss the fit and transform function in sklearn. -> pd.get_dummies(train.Embarked)
from sklearn.preprocessing import OneHotEncoder
# One-hot encode Embarked. handle_unknown='ignore' makes transform() emit
# an all-zero row for any category not seen during fit.
# NOTE(review): `sparse=False` was renamed to `sparse_output=False` in
# scikit-learn 1.2 and removed in 1.4 — confirm the installed version.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
# .values turns the Pandas series into a 1d numpy array
# .reshape(-1,-1) turns it into a 2d array of X rows and 1 column
OH_cols_train = OH_encoder.fit_transform(train['Embarked'].values.reshape(-1,1))
# Wrap the encoded array in a DataFrame whose columns are the category
# labels learned from the train set (here: C, Q, S, UNK).
OH_cols_train_pd = pd.DataFrame(OH_cols_train,
columns=OH_encoder.categories_[0].tolist()) # Don't set columns to nested list
# also encode the test dataset
# transform() only (no fit) so test reuses the train-fitted categories.
OH_cols_test = OH_encoder.transform(test['Embarked'].values.reshape(-1,1))
OH_cols_test_pd = pd.DataFrame(OH_cols_test,
columns=OH_encoder.categories_[0].tolist())
OH_cols_train_pd
| C | Q | S | UNK | |
|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1 | 1.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 0.0 | 0.0 | 1.0 | 0.0 |
| ... | ... | ... | ... | ... |
| 886 | 0.0 | 0.0 | 1.0 | 0.0 |
| 887 | 0.0 | 0.0 | 1.0 | 0.0 |
| 888 | 0.0 | 0.0 | 1.0 | 0.0 |
| 889 | 1.0 | 0.0 | 0.0 | 0.0 |
| 890 | 0.0 | 1.0 | 0.0 | 0.0 |
891 rows × 4 columns
train = pd.concat([train, OH_cols_train_pd], axis = 1)
test = pd.concat([test, OH_cols_test_pd], axis = 1)
train
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | SexId | C | Q | S | UNK | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 1 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 1 | 0.0 | 0.0 | 1.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S | 1 | 0.0 | 0.0 | 1.0 | 0.0 |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S | 0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 30.0 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S | 0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C | 1 | 1.0 | 0.0 | 0.0 | 0.0 |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q | 1 | 0.0 | 1.0 | 0.0 | 0.0 |
891 rows × 17 columns
def trim_data(dataset):
    """Return a copy of *dataset* without the columns the models won't
    use (identifiers, raw text fields, and already-encoded originals).
    """
    unused = ['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'UNK']
    return dataset.drop(columns=unused)
train_trimmed = trim_data(train)
test_trimmed = trim_data(test)
train_trimmed.head()
| Survived | Pclass | Age | SibSp | Parch | Fare | SexId | C | Q | S | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 | 1 | 0.0 | 0.0 | 1.0 |
| 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | 0 | 1.0 | 0.0 | 0.0 |
| 2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 | 0 | 0.0 | 0.0 | 1.0 |
| 3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | 0 | 0.0 | 0.0 | 1.0 |
| 4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 | 1 | 0.0 | 0.0 | 1.0 |
We have filled all missing values in the train dataset but still missing one value in the Test dataset. We will just plug that by the passenger's Pclass.
test_trimmed.isnull().sum()
Pclass 0 Age 0 SibSp 0 Parch 0 Fare 1 SexId 0 C 0 Q 0 S 0 dtype: int64
test_trimmed[test_trimmed['Fare'].isnull()]
| Pclass | Age | SibSp | Parch | Fare | SexId | C | Q | S | |
|---|---|---|---|---|---|---|---|---|---|
| 152 | 3 | 60.5 | 0 | 0 | NaN | 1 | 0.0 | 0.0 | 1.0 |
Pclass_Fare = train_trimmed.groupby('Pclass').agg({'Fare':'median'})
Pclass_Fare
| Fare | |
|---|---|
| Pclass | |
| 1 | 60.2875 |
| 2 | 14.2500 |
| 3 | 8.0500 |
test_trimmed.loc[test_trimmed.Fare.isna(), 'Fare'] = float(Pclass_Fare.loc[3])
test_trimmed.isna().sum()
Pclass 0 Age 0 SibSp 0 Parch 0 Fare 0 SexId 0 C 0 Q 0 S 0 dtype: int64
Pclass_by_Survival = train_trimmed.loc[:,['Survived','Pclass']].groupby(['Survived', 'Pclass']).agg({'Pclass' : 'count'})
Pclass_by_Survival.columns = ['No. of Survivals']
Pclass_by_Survival
| No. of Survivals | ||
|---|---|---|
| Survived | Pclass | |
| 0 | 1 | 80 |
| 2 | 97 | |
| 3 | 372 | |
| 1 | 1 | 136 |
| 2 | 87 | |
| 3 | 119 |
With that we can see the survival rate of each Pclass
# Share of each Pclass within the survivor / non-survivor groups:
# divide each count by its Survived-group total.
Pclass_rate_by_Survival = Pclass_by_Survival.div(Pclass_by_Survival.groupby(level=0).sum())
# Bug fix: the original renamed Pclass_by_Survival's column by mistake,
# leaving the rate table displayed with the stale 'No. of Survivals'
# header; the rename was intended for the rate table.
Pclass_rate_by_Survival.columns = ['Pclass Rate of Survivors']
Pclass_rate_by_Survival
| No. of Survivals | ||
|---|---|---|
| Survived | Pclass | |
| 0 | 1 | 0.145719 |
| 2 | 0.176685 | |
| 3 | 0.677596 | |
| 1 | 1 | 0.397661 |
| 2 | 0.254386 | |
| 3 | 0.347953 |
We can see that among the survivors (Survived = 1), the highest proportion came from first class, even though first class passengers were the fewest.
We will do the same for sex.
Survival_by_sex = train_trimmed.loc[:,['SexId','Survived']].groupby(['SexId','Survived']).agg({'SexId' : 'count'})
Survival_rate_by_sex = Survival_by_sex.div(Survival_by_sex.groupby(level=0).sum())
Survival_rate_by_sex.columns = ['Survival rate of each gender']
Survival_rate_by_sex
| Survival rate of each gender | ||
|---|---|---|
| SexId | Survived | |
| 0 | 0 | 0.257962 |
| 1 | 0.742038 | |
| 1 | 0 | 0.811092 |
| 1 | 0.188908 |
Also here we can see that most of the women survived and most of the men didn't.
Next we will do something similar for embarking ports.
Survival_by_Embarked = train.loc[:,['Embarked','Survived']].groupby(['Embarked','Survived']).agg({'Embarked' : 'count'})
Survival_rate_by_Embarked = Survival_by_Embarked.div(Survival_by_Embarked.groupby(level=0).sum())
Survival_rate_by_Embarked.columns = ['Survival rate of each embarking port']
Survival_rate_by_Embarked
| Survival rate of each embarking port | ||
|---|---|---|
| Embarked | Survived | |
| C | 0 | 0.446429 |
| 1 | 0.553571 | |
| Q | 0 | 0.610390 |
| 1 | 0.389610 | |
| S | 0 | 0.663043 |
| 1 | 0.336957 | |
| UNK | 1 | 1.000000 |
We can see that port C has a higher survival rate. A reasonable guess would be that it is where most first class passengers boarded.
pandas_profiling.ProfileReport(train_trimmed)
from sklearn.covariance import empirical_covariance
empirical_covariance(train_trimmed)
array([[ 2.36506479e-01, -1.37548323e-01, -3.40851047e-01,
-1.89323085e-02, 3.19808636e-02, 6.21480390e+00,
-1.26234284e-01, 3.20035371e-02, 4.98815314e-04,
-3.38854312e-02],
[-1.37548323e-01, 6.98230591e-01, -3.80367114e+00,
7.65127267e-02, 1.24149589e-02, -2.28045731e+01,
5.26527275e-02, -7.95194746e-02, 5.18906486e-02,
3.05662939e-02],
[-3.40851047e-01, -3.80367114e+00, 1.73541480e+02,
-4.62754242e+00, -2.12656315e+00, 5.60678399e+01,
5.52571330e-01, 2.02869397e-01, -6.25138906e-02,
-1.86729787e-01],
[-1.89323085e-02, 7.65127267e-02, -4.62754242e+00,
1.21467827e+00, 3.68324724e-01, 8.73891480e+00,
-6.03541337e-02, -2.56625363e-02, -8.16117277e-03,
3.49976886e-02],
[ 3.19808636e-02, 1.24149589e-02, -2.12656315e+00,
3.68324724e-01, 6.48999031e-01, 8.65133108e+00,
-9.44776358e-02, -3.48792829e-03, -1.83868867e-02,
2.27313665e-02],
[ 6.21480390e+00, -2.28045731e+01, 5.60678399e+01,
8.73891480e+00, 8.65133108e+00, 2.46666531e+03,
-4.32608473e+00, 5.23231118e+00, -1.63576848e+00,
-3.70382843e+00],
[-1.26234284e-01, 5.26527275e-02, 5.52571330e-01,
-6.03541337e-02, -9.44776358e-02, -4.32608473e+00,
2.28218083e-01, -1.54821692e-02, -9.94859431e-03,
2.68843819e-02],
[ 3.20035371e-02, -7.95194746e-02, 2.02869397e-01,
-2.56625363e-02, -3.48792829e-03, 5.23231118e+00,
-1.54821692e-02, 1.53000261e-01, -1.62946336e-02,
-1.36282390e-01],
[ 4.98815314e-04, 5.18906486e-02, -6.25138906e-02,
-8.16117277e-03, -1.83868867e-02, -1.63576848e+00,
-9.94859431e-03, -1.62946336e-02, 7.89513794e-02,
-6.24627621e-02],
[-3.38854312e-02, 3.05662939e-02, -1.86729787e-01,
3.49976886e-02, 2.27313665e-02, -3.70382843e+00,
2.68843819e-02, -1.36282390e-01, -6.24627621e-02,
2.00367561e-01]])
from sklearn import preprocessing
# Features: everything except the target column.
X_train = train_trimmed.drop(['Survived'], axis=1)
X_test = test_trimmed
# Fit the scaler on the train features only, then apply the same fitted
# transform to both train and test to avoid data leakage.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Target vector.
y_train = train_trimmed['Survived']
We will first standardize and scale the data and then train the Logistic Regression model.
from sklearn.linear_model import LogisticRegression
clf_logistic = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)
Try predicting the train dataset and show the probability
clf_logistic.predict(X_train_scaled[:2])
array([0, 1], dtype=int64)
clf_logistic.predict_proba(X_train_scaled[:2])
array([[0.91347504, 0.08652496],
[0.0802736 , 0.9197264 ]])
clf_logistic.score(X_train_scaled, y_train)
0.8013468013468014
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, clf_logistic.predict(X_train_scaled))
array([[475, 74],
[103, 239]], dtype=int64)
from sklearn.metrics import classification_report
print(classification_report(y_train, clf_logistic.predict(X_train_scaled)))
precision recall f1-score support
0 0.82 0.87 0.84 549
1 0.76 0.70 0.73 342
accuracy 0.80 891
macro avg 0.79 0.78 0.79 891
weighted avg 0.80 0.80 0.80 891
Try predicting the test dataset and show the probability
test_result_logistic = clf_logistic.predict(X_test_scaled)
test_result_logistic[:10]
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0], dtype=int64)
clf_logistic.predict_proba(X_test_scaled[:10])
array([[0.8916186 , 0.1083814 ],
[0.64524943, 0.35475057],
[0.88625344, 0.11374656],
[0.89972525, 0.10027475],
[0.43550033, 0.56449967],
[0.84669719, 0.15330281],
[0.32085859, 0.67914141],
[0.81314057, 0.18685943],
[0.22302484, 0.77697516],
[0.93280895, 0.06719105]])
from sklearn import svm
clf_svc = svm.SVC().fit(X_train_scaled, y_train)
clf_svc.predict(X_train_scaled[:2]) #SVMs do not directly provide probability estimates
array([0, 1], dtype=int64)
clf_svc.score(X_train_scaled, y_train)
0.8439955106621774
confusion_matrix(y_train, clf_svc.predict(X_train_scaled))
array([[522, 27],
[112, 230]], dtype=int64)
print(classification_report(y_train, clf_svc.predict(X_train_scaled)))
precision recall f1-score support
0 0.82 0.95 0.88 549
1 0.89 0.67 0.77 342
accuracy 0.84 891
macro avg 0.86 0.81 0.83 891
weighted avg 0.85 0.84 0.84 891
from sklearn.linear_model import RidgeClassifier
clf_ridge = RidgeClassifier().fit(X_train_scaled, y_train)
clf_ridge.score(X_train_scaled, y_train)
0.7934904601571269
confusion_matrix(y_train, clf_ridge.predict(X_train_scaled))
array([[468, 81],
[103, 239]], dtype=int64)
from sklearn.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier(n_neighbors=2).fit(X_train_scaled, y_train)
clf_knn.score(X_train_scaled, y_train)
0.8832772166105499
confusion_matrix(y_train, clf_knn.predict(X_train_scaled))
array([[548, 1],
[103, 239]], dtype=int64)
from sklearn.neighbors import NeighborhoodComponentsAnalysis
nca = NeighborhoodComponentsAnalysis(random_state=42).fit(X_train_scaled, y_train)
clf_knn_post_nca = KNeighborsClassifier(n_neighbors=2).fit(nca.transform(X_train_scaled), y_train)
clf_knn_post_nca.score(nca.transform(X_train_scaled), y_train)
0.8922558922558923
confusion_matrix(y_train, clf_knn_post_nca.predict(nca.transform(X_train_scaled)))
array([[548, 1],
[ 95, 247]], dtype=int64)
from sklearn import tree
clf_tree = tree.DecisionTreeClassifier().fit(X_train_scaled, y_train)
clf_tree.score(X_train_scaled, y_train)
0.9764309764309764
from sklearn.ensemble import RandomForestClassifier
clf_rf_5 = RandomForestClassifier(n_estimators=5).fit(X_train_scaled, y_train)
clf_rf_5.score(X_train_scaled, y_train)
0.9461279461279462
from sklearn.ensemble import RandomForestClassifier
clf_rf_10 = RandomForestClassifier(n_estimators=10).fit(X_train_scaled, y_train)
clf_rf_10.score(X_train_scaled, y_train)
0.9629629629629629
clf_rf_50 = RandomForestClassifier(n_estimators=50).fit(X_train_scaled, y_train)
clf_rf_50.score(X_train_scaled, y_train)
0.9764309764309764
from sklearn.ensemble import AdaBoostClassifier
clf_ada = AdaBoostClassifier(n_estimators=100).fit(X_train_scaled, y_train)
clf_ada.score(X_train_scaled, y_train)
0.8462401795735129
from sklearn.ensemble import GradientBoostingClassifier
clf_gb = GradientBoostingClassifier(
n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0
).fit(X_train_scaled, y_train)
clf_gb.score(X_train_scaled, y_train)
0.8552188552188552
from sklearn.neural_network import MLPClassifier
clf_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
hidden_layer_sizes=(5, 2), max_iter=800, random_state=1).fit(X_train_scaled, y_train)
clf_mlp.score(X_train_scaled, y_train)
0.8428731762065096
from sklearn.ensemble import VotingClassifier
clf_voting = VotingClassifier(
estimators=[('lr', clf_logistic), ('svc', clf_svc), ('rc', clf_ridge), ('rf', clf_rf_10), ('knn', clf_knn)],
voting='hard').fit(X_train_scaled, y_train) # soft voting is based on predicted probabilities
from sklearn.metrics import classification_report
print(classification_report(y_train, clf_voting.predict(X_train_scaled)))
precision recall f1-score support
0 0.85 0.95 0.90 549
1 0.90 0.74 0.81 342
accuracy 0.87 891
macro avg 0.88 0.84 0.86 891
weighted avg 0.87 0.87 0.87 891
confusion_matrix(y_train, clf_voting.predict(X_train_scaled))
array([[522, 27],
[ 90, 252]], dtype=int64)
test_result_voting = clf_voting.predict(X_test_scaled)
test_result_voting[:10]
array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0], dtype=int64)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf_logistic, X_train_scaled, y_train, cv=5)
print("Logisitic regression: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
scores = cross_val_score(clf_svc, X_train_scaled, y_train, cv=5)
print("SVC: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
scores = cross_val_score(clf_ridge, X_train_scaled, y_train, cv=5)
print("Ridge: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
scores = cross_val_score(clf_knn, X_train_scaled, y_train, cv=5)
print("KNN: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
scores = cross_val_score(clf_knn_post_nca, nca.transform(X_train_scaled), y_train, cv=5)
print("KNN with NCA: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
scores =cross_val_score(clf_tree, X_train_scaled, y_train, cv=5)
print("Tree: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
scores =cross_val_score(clf_rf_5, X_train_scaled, y_train, cv=5)
print("Random Forest 5: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
scores =cross_val_score(clf_rf_10, X_train_scaled, y_train, cv=5)
print("Random Forest 10: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
scores =cross_val_score(clf_rf_50, X_train_scaled, y_train, cv=5)
print("Random Forest 50: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
scores =cross_val_score(clf_ada, X_train_scaled, y_train, cv=5)
print("AdaBoost: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
scores =cross_val_score(clf_gb, X_train_scaled, y_train, cv=5)
print("Gradient Boosting: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
scores =cross_val_score(clf_mlp, X_train_scaled, y_train, cv=5)
print("MLP: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
scores =cross_val_score(clf_voting, X_train_scaled, y_train, cv=5)
print("Voting (Hard): %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
Logisitic regression: 0.79 accuracy with a standard deviation of 0.02 SVC: 0.82 accuracy with a standard deviation of 0.02 Ridge: 0.79 accuracy with a standard deviation of 0.02 KNN: 0.78 accuracy with a standard deviation of 0.03 KNN with NCA: 0.80 accuracy with a standard deviation of 0.03 Tree: 0.78 accuracy with a standard deviation of 0.03 Random Forest 5: 0.80 accuracy with a standard deviation of 0.03 Random Forest 10: 0.80 accuracy with a standard deviation of 0.03 Random Forest 50: 0.81 accuracy with a standard deviation of 0.02 AdaBoost: 0.81 accuracy with a standard deviation of 0.02 Gradient Boosting: 0.82 accuracy with a standard deviation of 0.02 MLP: 0.81 accuracy with a standard deviation of 0.03 Voting (Hard): 0.82 accuracy with a standard deviation of 0.02
from sklearn.decomposition import PCA
pca = PCA(n_components='mle').fit(X_train_scaled)
print(pca.explained_variance_ratio_)
X_train_scaled_pca = pca.transform(X_train_scaled)
X_test_scaled_pca = pca.transform(X_test_scaled)
[0.24051524 0.20123658 0.18300588 0.1135156 0.09011405 0.07295949 0.05863515 0.03945013]
X_train_scaled_pca.shape
(891, 8)
clf_voting_pca = VotingClassifier(
estimators=[('lr', clf_logistic), ('svc', clf_svc), ('rc', clf_ridge), ('rf', clf_rf_10), ('knn', clf_knn)],
voting='hard').fit(X_train_scaled_pca, y_train)
print(classification_report(y_train, clf_voting_pca.predict(X_train_scaled_pca)))
precision recall f1-score support
0 0.85 0.95 0.90 549
1 0.90 0.74 0.81 342
accuracy 0.87 891
macro avg 0.88 0.85 0.86 891
weighted avg 0.87 0.87 0.87 891
confusion_matrix(y_train, clf_voting_pca.predict(X_train_scaled_pca))
array([[522, 27],
[ 89, 253]], dtype=int64)
test_result_voting_pca = clf_voting_pca.predict(X_test_scaled_pca)
submission_df = pd.DataFrame([test['PassengerId'].tolist(), test_result_voting_pca.tolist()], index=['PassengerId', 'Survived']).T
submission_df
| PassengerId | Survived | |
|---|---|---|
| 0 | 892 | 0 |
| 1 | 893 | 0 |
| 2 | 894 | 0 |
| 3 | 895 | 0 |
| 4 | 896 | 0 |
| ... | ... | ... |
| 413 | 1305 | 0 |
| 414 | 1306 | 1 |
| 415 | 1307 | 0 |
| 416 | 1308 | 0 |
| 417 | 1309 | 0 |
418 rows × 2 columns
submission_df.to_csv('submission.csv', index=False)
After submitting 3 results files to Kaggle, i.e. results from logistic regression, voting classification, and also voting classification but with PCA transformed data, we found that although voting classification (without PCA) yields the best result, all 3 scores were only slightly different.
So much of the magic with machine learning still lies in feature engineering. And this is what we will continue to explore.